{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6846f2a0",
   "metadata": {},
   "source": [
    "## selenium 爬虫 - 木材网\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "044da221",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mac\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "driver_path=''\n",
    "match(os.name):\n",
    "    case \"nt\":\n",
    "        # Windows\n",
    "        driver_path = r\"./driver/chromedriver-win64/chromedriver.exe\"  \n",
    "        print(\"Windows\")\n",
    "    case \"posix\":\n",
    "        # Mac\n",
    "        driver_path = r\"./driver/chromedriver-mac-arm64/chromedriver\"  \n",
    "        print(\"Mac\")\n",
    "\n",
    "os.environ[\"PATH\"] += \";\" + driver_path"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8e52d3b2",
   "metadata": {},
   "source": [
    "### 启动 selenium\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f84744ea",
   "metadata": {},
   "source": [
    "#### 启动浏览器 和 打开网站\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "94d3c4bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.chrome.service import Service\n",
    "driver = webdriver.Chrome(service=Service(executable_path=driver_path))\n",
    "driver.get(\"https://www.chinatimber.org/baojia/baojia.html?prov=&bigCate=&q=\")\n",
    "time.sleep(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f40d110a",
   "metadata": {},
   "source": [
    "### 获取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f6a4e47a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据最大页数为： 605\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "from selenium.webdriver.common.by import By\n",
    "# 定位到倒数第二个 <li> 元素 即为最大页数\n",
    "pagination_items = driver.find_elements(By.CSS_SELECTOR, \"ul.pagination li\")\n",
    "max_page_item = pagination_items[-2] \n",
    "# 提取文本\n",
    "max_page = max_page_item.text.strip()\n",
    "print(f\"数据最大页数为： {max_page}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "859496f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "max_page=int(5)\n",
    "reduce_data = []\n",
    "def get_data():\n",
    "    for i in range(1, int(max_page)):\n",
    "        print(f\"正在抓取第{i}页\")\n",
    "        # 定位表格中的行\n",
    "        rows = driver.find_elements(By.CSS_SELECTOR, \"tbody tr\")\n",
    "\n",
    "        # 遍历每一行提取数据\n",
    "        data = []\n",
    "        for row in rows:\n",
    "            columns = row.find_elements(By.TAG_NAME, \"td\")\n",
    "            row_data = [col.text.strip() for col in columns]\n",
    "            data.append(row_data)\n",
    "        # 将数据添加到 reduce_data 列表中\n",
    "        reduce_data.extend(data)\n",
    "       # 翻页部分\n",
    "        try:\n",
    "            pagination_items = driver.find_elements(By.CSS_SELECTOR, \"ul.pagination li\")\n",
    "            next_page_btn = pagination_items[-1].find_element(By.TAG_NAME, \"a\")\n",
    "            next_page_btn.click()\n",
    "            time.sleep(random.randint(1, 3))\n",
    "        except Exception:\n",
    "            return \n",
    "    return reduce_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6939fcb2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "正在抓取第1页\n",
      "正在抓取第2页\n",
      "正在抓取第3页\n",
      "正在抓取第4页\n",
      "正在抓取第5页\n"
     ]
    }
   ],
   "source": [
    "form_data=get_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8ed12e9",
   "metadata": {},
   "source": [
    "# 数据清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "03e43e93",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>size</th>\n",
       "      <th>grade</th>\n",
       "      <th>location</th>\n",
       "      <th>category</th>\n",
       "      <th>price</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>双面热压拼板机</td>\n",
       "      <td>6200-1300-180mm</td>\n",
       "      <td>优等</td>\n",
       "      <td>山东</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>110000元/台</td>\n",
       "      <td>2024-05-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>木工热压机</td>\n",
       "      <td>2500-1300-42mm</td>\n",
       "      <td>优等</td>\n",
       "      <td>山东</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>110000元/台</td>\n",
       "      <td>2024-05-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>可调速单板剪切机</td>\n",
       "      <td>2700-60mm</td>\n",
       "      <td>优等</td>\n",
       "      <td>山东</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>23000元/台</td>\n",
       "      <td>2024-05-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>双面A字形拼方机</td>\n",
       "      <td>6200-1300-150mm</td>\n",
       "      <td>优等</td>\n",
       "      <td>山东</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>110000元/台</td>\n",
       "      <td>2024-05-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>各种规格木业裁皮机</td>\n",
       "      <td>2700-60mm</td>\n",
       "      <td>优等</td>\n",
       "      <td>山东</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>23000元/台</td>\n",
       "      <td>2024-05-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>南美菠萝格板材</td>\n",
       "      <td>规格可订</td>\n",
       "      <td>优等</td>\n",
       "      <td>进口</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>4000元/m3</td>\n",
       "      <td>2023-02-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>柳桉全红板材</td>\n",
       "      <td>规格可订</td>\n",
       "      <td>优等</td>\n",
       "      <td>进口</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>3000元/m3</td>\n",
       "      <td>2023-02-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>菠萝格全红板材</td>\n",
       "      <td>规格可订</td>\n",
       "      <td>优等</td>\n",
       "      <td>进口</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>7000元/m3</td>\n",
       "      <td>2023-02-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>巴蒂木地板</td>\n",
       "      <td>规格可订</td>\n",
       "      <td>优等</td>\n",
       "      <td>巴西</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>4000元/m3</td>\n",
       "      <td>2023-02-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>巴蒂木板材</td>\n",
       "      <td>规格可订</td>\n",
       "      <td>优等</td>\n",
       "      <td>巴西</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>3000元/m3</td>\n",
       "      <td>2023-02-08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         name             size grade location category      price        date\n",
       "0     双面热压拼板机  6200-1300-180mm    优等       山东     板材原木  110000元/台  2024-05-15\n",
       "1       木工热压机   2500-1300-42mm    优等       山东     板材原木  110000元/台  2024-05-15\n",
       "2    可调速单板剪切机        2700-60mm    优等       山东     板材原木   23000元/台  2024-05-15\n",
       "3    双面A字形拼方机  6200-1300-150mm    优等       山东     板材原木  110000元/台  2024-05-15\n",
       "4   各种规格木业裁皮机        2700-60mm    优等       山东     板材原木   23000元/台  2024-05-15\n",
       "..        ...              ...   ...      ...      ...        ...         ...\n",
       "95    南美菠萝格板材             规格可订    优等       进口     板材原木   4000元/m3  2023-02-08\n",
       "96     柳桉全红板材             规格可订    优等       进口     板材原木   3000元/m3  2023-02-08\n",
       "97    菠萝格全红板材             规格可订    优等       进口     板材原木   7000元/m3  2023-02-08\n",
       "98      巴蒂木地板             规格可订    优等       巴西     板材原木   4000元/m3  2023-02-08\n",
       "99      巴蒂木板材             规格可订    优等       巴西     板材原木   3000元/m3  2023-02-08\n",
       "\n",
       "[100 rows x 7 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# 转化为 DataFrame\n",
    "columns = ['name', 'size', 'grade', 'location', 'category', 'price', 'date']\n",
    "df = pd.DataFrame(form_data, columns=columns)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c1fa0693",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 移除grade为空的行\n",
    "df = df[df['grade'] != '']\n",
    "\n",
    "# 清洗日期转换为时间戳\n",
    "df['date'] = pd.to_datetime(df['date'])\n",
    "df['date'] = df['date'].astype(int) // 10**9\n",
    "\n",
    "# 清洗grade 把 'a' 替换成 'A级'\n",
    "df['grade'] = df['grade'].apply(lambda x: x.replace('a', 'A级') if x == 'a' else x)\n",
    "\n",
    "# 匹配 '*等品' 的替换成\t'*等'\n",
    "df['grade'] = df['grade'].apply(lambda x: x.replace('等品', '等') if '等品' in x else x)\n",
    "\n",
    "# 匹配仅有一个 '' 的替换成\t'A级'\n",
    "df['grade'] = df['grade'].apply(lambda x: x.replace('A', 'A级') if x == 'A' else x)\n",
    "\n",
    "# 清洗仅有 '1' 的替换成 '一级'\n",
    "df['grade'] = df['grade'].apply(lambda x: x.replace('1', '一级') if x == '1' else x)\n",
    "\n",
    "# 从price列提取 '0.45元/棵' 的'棵'，并分组统计\n",
    "df['type'] = df['price'].apply(lambda x: x.split('/')[1])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e6bc95fb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "grade\n",
       "优等         35\n",
       "A级         21\n",
       "AAA        20\n",
       "统材          6\n",
       "FAS         3\n",
       "B级          3\n",
       "C级          3\n",
       "特等          2\n",
       "无节材         2\n",
       "AA          1\n",
       "通货级         1\n",
       "带二接头通货级     1\n",
       "一等          1\n",
       "AB          1\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 统计grade的种类然后列出来\n",
    "df['grade'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "4c88d5db",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "location\n",
       "国产           20\n",
       "山东           14\n",
       "进口           14\n",
       "东南亚          10\n",
       "缅甸            6\n",
       "非洲            6\n",
       "欧洲            5\n",
       "南美洲           5\n",
       "东北牡丹江         4\n",
       "湖南            3\n",
       "北美洲           3\n",
       "南美            2\n",
       "巴西            2\n",
       "贵州            2\n",
       "老挝            2\n",
       "南&#16047;     1\n",
       "巴新            1\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 统计location的种类然后列出来\n",
    "df['location'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "822ef0a2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "name\n",
       "楸木木皮       7\n",
       "水曲柳木皮      5\n",
       "金丝柚        4\n",
       "节松木皮       3\n",
       "南美菠萝格      2\n",
       "          ..\n",
       "鹅掌揪        1\n",
       "美国小叶樱桃木    1\n",
       "老挝大红酸枝木    1\n",
       "山樟         1\n",
       "巴蒂木板材      1\n",
       "Name: count, Length: 79, dtype: int64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 统计name的种类然后列出来\n",
    "df['name'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "fa18ef34",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "type\n",
       "m3    59\n",
       "m2    19\n",
       "台     14\n",
       "吨      8\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 统计type的种类然后列出来\n",
    "df['type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "05040d9c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>size</th>\n",
       "      <th>grade</th>\n",
       "      <th>location</th>\n",
       "      <th>category</th>\n",
       "      <th>price</th>\n",
       "      <th>date</th>\n",
       "      <th>type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>双面热压拼板机</td>\n",
       "      <td>6200-1300-180mm</td>\n",
       "      <td>优等</td>\n",
       "      <td>山东</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>110000元/台</td>\n",
       "      <td>1715731200</td>\n",
       "      <td>台</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>木工热压机</td>\n",
       "      <td>2500-1300-42mm</td>\n",
       "      <td>优等</td>\n",
       "      <td>山东</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>110000元/台</td>\n",
       "      <td>1715731200</td>\n",
       "      <td>台</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>可调速单板剪切机</td>\n",
       "      <td>2700-60mm</td>\n",
       "      <td>优等</td>\n",
       "      <td>山东</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>23000元/台</td>\n",
       "      <td>1715731200</td>\n",
       "      <td>台</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>双面A字形拼方机</td>\n",
       "      <td>6200-1300-150mm</td>\n",
       "      <td>优等</td>\n",
       "      <td>山东</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>110000元/台</td>\n",
       "      <td>1715731200</td>\n",
       "      <td>台</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>各种规格木业裁皮机</td>\n",
       "      <td>2700-60mm</td>\n",
       "      <td>优等</td>\n",
       "      <td>山东</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>23000元/台</td>\n",
       "      <td>1715731200</td>\n",
       "      <td>台</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>南美菠萝格板材</td>\n",
       "      <td>规格可订</td>\n",
       "      <td>优等</td>\n",
       "      <td>进口</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>4000元/m3</td>\n",
       "      <td>1675814400</td>\n",
       "      <td>m3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>柳桉全红板材</td>\n",
       "      <td>规格可订</td>\n",
       "      <td>优等</td>\n",
       "      <td>进口</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>3000元/m3</td>\n",
       "      <td>1675814400</td>\n",
       "      <td>m3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>菠萝格全红板材</td>\n",
       "      <td>规格可订</td>\n",
       "      <td>优等</td>\n",
       "      <td>进口</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>7000元/m3</td>\n",
       "      <td>1675814400</td>\n",
       "      <td>m3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>巴蒂木地板</td>\n",
       "      <td>规格可订</td>\n",
       "      <td>优等</td>\n",
       "      <td>巴西</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>4000元/m3</td>\n",
       "      <td>1675814400</td>\n",
       "      <td>m3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>巴蒂木板材</td>\n",
       "      <td>规格可订</td>\n",
       "      <td>优等</td>\n",
       "      <td>巴西</td>\n",
       "      <td>板材原木</td>\n",
       "      <td>3000元/m3</td>\n",
       "      <td>1675814400</td>\n",
       "      <td>m3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         name             size grade location category      price        date  \\\n",
       "0     双面热压拼板机  6200-1300-180mm    优等       山东     板材原木  110000元/台  1715731200   \n",
       "1       木工热压机   2500-1300-42mm    优等       山东     板材原木  110000元/台  1715731200   \n",
       "2    可调速单板剪切机        2700-60mm    优等       山东     板材原木   23000元/台  1715731200   \n",
       "3    双面A字形拼方机  6200-1300-150mm    优等       山东     板材原木  110000元/台  1715731200   \n",
       "4   各种规格木业裁皮机        2700-60mm    优等       山东     板材原木   23000元/台  1715731200   \n",
       "..        ...              ...   ...      ...      ...        ...         ...   \n",
       "95    南美菠萝格板材             规格可订    优等       进口     板材原木   4000元/m3  1675814400   \n",
       "96     柳桉全红板材             规格可订    优等       进口     板材原木   3000元/m3  1675814400   \n",
       "97    菠萝格全红板材             规格可订    优等       进口     板材原木   7000元/m3  1675814400   \n",
       "98      巴蒂木地板             规格可订    优等       巴西     板材原木   4000元/m3  1675814400   \n",
       "99      巴蒂木板材             规格可订    优等       巴西     板材原木   3000元/m3  1675814400   \n",
       "\n",
       "   type  \n",
       "0     台  \n",
       "1     台  \n",
       "2     台  \n",
       "3     台  \n",
       "4     台  \n",
       "..  ...  \n",
       "95   m3  \n",
       "96   m3  \n",
       "97   m3  \n",
       "98   m3  \n",
       "99   m3  \n",
       "\n",
       "[100 rows x 8 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94999323",
   "metadata": {},
   "source": [
    "## 入库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "50415ad3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "\n",
    "# 连接到 SQLite 数据库（如果数据库不存在会自动创建）\n",
    "conn = sqlite3.connect('timber_data.db')\n",
    "cursor = conn.cursor()\n",
    "\n",
    "# 创建表格\n",
    "cursor.execute('''\n",
    "CREATE TABLE IF NOT EXISTS timber_data (\n",
    "\tname TEXT,\n",
    "\tsize TEXT,\n",
    "\tgrade TEXT,\n",
    "\tlocation TEXT,\n",
    "\tcategory TEXT,\n",
    "\tprice TEXT,\n",
    "\tdate TEXT,\n",
    "\ttype TEXT\n",
    ")\n",
    "''')\n",
    "\n",
    "# 将 DataFrame 数据插入到表格中\n",
    "for row in df.itertuples(index=False):\n",
    "\tcursor.execute('''\n",
    "\tINSERT INTO timber_data (name, size, grade, location, category, price, date,\ttype)\n",
    "\tVALUES (?, ?, ?, ?, ?, ?, ?,\t?)\n",
    "\t''', row)\n",
    "\n",
    "# 提交事务并关闭连接\n",
    "conn.commit()\n",
    "conn.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cefcbdd5",
   "metadata": {},
   "source": [
    "# ==============================================================="
   ]
  },
  {
   "cell_type": "markdown",
   "id": "432b2f3f",
   "metadata": {},
   "source": [
    "# 自动化全流程爬取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "e2170267",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mac\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<sqlite3.Cursor at 0x10c972840>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将下载的浏览器驱动文件解压并放在一个好记的文件夹中\n",
    "import os\n",
    "driver_path=''\n",
    "match(os.name):\n",
    "    case \"nt\":\n",
    "        # Windows\n",
    "        driver_path = r\"./driver/chromedriver-win64/chromedriver.exe\"  \n",
    "        print(\"Windows\")\n",
    "    case \"posix\":\n",
    "        # Mac\n",
    "        driver_path = r\"./driver/chromedriver-mac-arm64/chromedriver\"  \n",
    "        print(\"Mac\")\n",
    "\n",
    "os.environ[\"PATH\"] += \";\" + driver_path\n",
    "\n",
    "import sqlite3\n",
    "# 连接到 SQLite 数据库（如果数据库不存在会自动创建）\n",
    "conn = sqlite3.connect('timber_data.db')\n",
    "cursor = conn.cursor()\n",
    "# 创建表格\n",
    "cursor.execute('''\n",
    "CREATE TABLE IF NOT EXISTS timber_data (\n",
    "\tname TEXT,\n",
    "\tsize TEXT,\n",
    "\tgrade TEXT,\n",
    "\tlocation TEXT,\n",
    "\tcategory TEXT,\n",
    "\tprice TEXT,\n",
    "\tdate TEXT,\n",
    "\ttype TEXT\n",
    ")\n",
    "''')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b5eb9338",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.chrome.service import Service\n",
    "driver = webdriver.Chrome(service=Service(executable_path=driver_path))\n",
    "driver.get(\"https://www.chinatimber.org/baojia/baojia.html?prov=&bigCate=&q=\")\n",
    "time.sleep(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42853b08",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import random\n",
    "from selenium.webdriver.common.by import By\n",
    "# 定位到倒数第二个 <li> 元素 即为最大页数\n",
    "pagination_items = driver.find_elements(By.CSS_SELECTOR, \"ul.pagination li\")\n",
    "max_page_item = pagination_items[-2] \n",
    "# 提取文本\n",
    "max_page = max_page_item.text.strip()\n",
    "print(f\"数据最大页数为： {max_page}\")\n",
    "max_page=int(max_page)\n",
    "reduce_data = []\n",
    "def get_data():\n",
    "    for i in range(1, int(max_page) + 1):\n",
    "        print(f\"正在抓取第{i}页\")\n",
    "        # 定位表格中的行\n",
    "        rows = driver.find_elements(By.CSS_SELECTOR, \"tbody tr\")\n",
    "\n",
    "        # 遍历每一行提取数据\n",
    "        data = []\n",
    "        for row in rows:\n",
    "            columns = row.find_elements(By.TAG_NAME, \"td\")\n",
    "            row_data = [col.text.strip() for col in columns]\n",
    "            data.append(row_data)\n",
    "\n",
    "        # 转化为 DataFrame\n",
    "        columns = ['name', 'size', 'grade', 'location', 'category', 'price', 'date']\n",
    "        df = pd.DataFrame(data, columns=columns)\n",
    "        df = df[df['grade'] != '']  # 移除 grade 为空的行\n",
    "        \n",
    "        # 数据清洗逻辑...\n",
    "        # 清洗日期转换为时间戳\n",
    "        df['date'] = pd.to_datetime(df['date'], errors='coerce')\n",
    "        df = df.dropna(subset=['date'])\n",
    "        df['date'] = df['date'].astype(int) // 10**9\n",
    "\n",
    "        # 清洗 grade\n",
    "        df['grade'] = df['grade'].apply(lambda x: x.replace('a', 'A级') if x == 'a' else x)\n",
    "        df['grade'] = df['grade'].apply(lambda x: x.replace('等品', '等') if '等品' in x else x)\n",
    "        df['grade'] = df['grade'].apply(lambda x: x.replace('A', 'A级') if x == 'A' else x)\n",
    "        df['grade'] = df['grade'].apply(lambda x: x.replace('1', '一级') if x == '1' else x)\n",
    "        df['type'] = df['price'].apply(lambda x: x.split('/')[1] if '/' in x else '无单位')\n",
    "\n",
    "\n",
    "        # 插入数据到数据库\n",
    "        insert_data(df)\n",
    "\n",
    "        # 翻页部分\n",
    "        try:\n",
    "            pagination_items = driver.find_elements(By.CSS_SELECTOR, \"ul.pagination li\")\n",
    "            next_page_btn = pagination_items[-1].find_element(By.TAG_NAME, \"a\")\n",
    "            next_page_btn.click()\n",
    "            time.sleep(random.randint(1, 3))\n",
    "        except Exception:\n",
    "            return \n",
    "        \n",
    "def insert_data(df):\n",
    "    for _, row in df.iterrows():\n",
    "        cursor.execute('''\n",
    "        INSERT INTO timber_data (name, size, grade, location, category, price, date, type)\n",
    "        VALUES (?, ?, ?, ?, ?, ?, ?, ?)\n",
    "        ''', tuple(row))  # 将每一行数据转为元组传入\n",
    "    conn.commit()  # 提交事务\n",
    "\n",
    "\n",
    "\n",
    "form_data=get_data()\n",
    "form_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "f90a8d43",
   "metadata": {},
   "outputs": [],
   "source": [
    "conn.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
